Mini Seminar

STAT 745 - Statistical Graphics

Robert Sholl

Part 1: ggplot - gg = plot

Problems

  1. I’m not good at ggplot

    • I use base R for everything
  2. The documentation is obtuse

    • Press F1 on a ggplot function

    • I’ll wait

  3. ggplot is very good at what it does

Possible Solutions

  1. Get better at ggplot

    • This uses time I’d rather waste
  2. Make better documentation

    • This is better for everyone
  3. Get better at base R plotting

    • I’m trying
  4. Turn ggplot into base R?

Possible Solutions

  1. Get better at ggplot

    • This uses time I’d rather waste
  2. Make better documentation

    • This is better for everyone
  3. Get better at base R plotting

    • I’m trying
  4. Turn ggplot into base R?

Method

  • Record the syntax for ggplot2 objects


  • Determine their base R analogs


  • Create a series of User-Defined Functions (UDFs) to map ggplot2 syntax to base R

Record

Scatterplots

# Template of the ggplot2 scatterplot syntax being recorded.
# NOTE(review): data, x, y, size, shape, color, limits, title and subtitle
# are presumably placeholders; none of them is defined on this slide.
# define the ggplot object
ggplot(data, aes(x = x, y = y)) + 
  
  # fill coordinate points
  ## size is mapped inside aes(); shape and color are passed as fixed values
  geom_point(aes(x = x, y = y, size = size), shape = shape, color = color) +
  
  # set x limits
  scale_x_discrete(limits = limits) + 
  
  # set y limits
  scale_y_continuous(limits = limits) + 
  
  # set titles / axis labels
  labs(title = title,
       subtitle = subtitle,
       x = x,
       y = y) + 
  
  # set theme
  theme_bw() + 
  
  # additional graphics
  ## hides the legend; added after theme_bw() in the chain
  theme(legend.position = "none")

Match

# Schematic only: each plot() argument is annotated with the name of the
# ggplot2 element from the scatterplot template that it stands in for
# (e.g. xlim <-> scale_x_discrete). The right-hand sides are labels for that
# correspondence, not values to pass as written.
# NOTE(review): lab(x) / lab(y) presumably refer to the x = / y = entries of
# labs() in the ggplot template -- confirm.
# define the plot object
plot(x = x, y = y,
     
     # set x limits
     xlim = scale_x_discrete,
     
     # set y limits
     ylim = scale_y_continuous(),
     
     # set titles / axis labels
     main = title, sub = subtitle,
     xlab = lab(x), ylab = lab(y),
     
     # point shape / size / color
     pch = shape,
     cex = size,
     col = color)

UDFs

  • Plot and ggplot are dynamic

    • They adjust to optional parameters being passed or left out
# perfectly valid inputs that return full responses
## base R: only the coordinates are supplied, every other argument is left out
plot(x = x, y = y)

## ggplot2: data + aesthetics, then a point layer with no arguments of its own
ggplot(data = data, aes(x = x, y = y)) +
  geom_point()

Optional Parameters

  • Every optional parameter defaults to NULL

  • I’ll use if (!is.null()) {} else {} statements to check

# generate the points
## check for graphical conditions pch/col/cex
### `} else if` is kept on the closing brace's line: outside a function body
### R finishes the `if` statement at a lone `}`, so an `else` that starts the
### next line is a syntax error
### `&&` is the scalar, short-circuiting "and" intended for if () conditions
if (!is.null(pch) && !is.null(col) && !is.null(cex)) {

  # if present generate points with all graphical params
  points = geom_point(aes(x = x, y = y, size = cex),
                      shape = pch, color = col)

} else if (!is.null(cex) && !is.null(col)) {

  # reduce as needed: color and size
  points = geom_point(aes(x = x, y = y, size = cex), color = col)
}

Base Scatter

# ya know what?
# *un-gg's your plot*
#
# ungg_scatter(): build a ggplot2 scatterplot from base-R style arguments.
#
# Arguments
#   x, y          coordinates, referenced as aes(x = x, y = y)
#   data          passed to ggplot() as the plot data
#   pch           optional point shape  -> geom_point(shape = )
#   xlim, ylim    optional axis limits  -> scale_*_continuous() when the
#                 coordinate is numeric, scale_*_discrete() otherwise;
#                 each axis is handled on its own
#   main, sub     title and optional subtitle -> labs(title = , subtitle = )
#   xlab, ylab    axis labels                 -> labs(x = , y = )
#   col           optional point color  -> geom_point(color = )
#   cex           optional point size   -> aes(size = )
#   theme_choice  optional theme object, e.g. theme_bw()
#
# Returns the assembled plot invisibly; save it to an object so extra
# layers can be added with `+`.
ungg_scatter = function(x,y,data,
                        pch=NULL,xlim=NULL,ylim=NULL,
                        main,sub=NULL,
                        xlab,ylab,
                        col=NULL,cex=NULL,theme_choice=NULL){

  # imagine not having it tho
  ## attaching (rather than ggplot2::) is deliberate: arguments are evaluated
  ## lazily, so a caller's `theme_choice = theme_bw()` is only looked up after
  ## this line has put ggplot2 on the search path
  require(ggplot2)

  # create the major ggplot object
  plt = ggplot(data, aes(x = x, y = y))

  # we remove the legend by default
  rm_legend = theme(legend.position = "none")

  # generate the points
  ## every graphical parameter is added only when supplied, so all
  ## combinations of pch / col / cex are covered (pch + cex without col
  ## used to drop pch)
  ## size stays inside aes(), exactly as before
  point_mapping = aes(x = x, y = y)
  if (!is.null(cex)) {
    point_mapping = aes(x = x, y = y, size = cex)
  }
  point_args = list(mapping = point_mapping)
  if (!is.null(pch)) {
    point_args$shape = pch
  }
  if (!is.null(col)) {
    point_args$color = col
  }
  points = do.call(geom_point, point_args)

  # subtitle is the only optional label param
  label_args = list(title = main, x = xlab, y = ylab)
  if (!is.null(sub)) {
    label_args$subtitle = sub
  }
  plt_labels = do.call(labs, label_args)

  # limit checks, set scale for each axis
  ## each axis is independent: giving only xlim (or only ylim) now works,
  ## and limits no longer depend on theme_choice being supplied
  xlimit = NULL
  if (!is.null(xlim)) {

    # is.numeric() covers integer and double and returns a single TRUE/FALSE
    if (is.numeric(x)) {

      # set x limit as continuous based on xlim
      xlimit = scale_x_continuous(limits = xlim)

    } else {

      # convert x to numeric
      ## lazy safety for character / factor / o.w.
      ## (the aes() above looks x up when the plot is built, so it sees this)
      x = as.numeric(x)

      # set x limit as discrete based on xlim
      xlimit = scale_x_discrete(limits = xlim)

    }
  }

  ylimit = NULL
  if (!is.null(ylim)) {

    if (is.numeric(y)) {

      # set y limit as continuous based on ylim
      ylimit = scale_y_continuous(limits = ylim)

    } else {

      # convert y to numeric, lazy safety again
      y = as.numeric(y)

      # set y limit as discrete based on ylim (was xlim by mistake)
      ylimit = scale_y_discrete(limits = ylim)

    }
  }

  # assemble: points, limits, labels, theme, then the legend removal
  ## rm_legend goes last so theme_choice cannot bring the legend back
  output_plot = plt + points
  if (!is.null(xlimit)) {
    output_plot = output_plot + xlimit
  }
  if (!is.null(ylimit)) {
    output_plot = output_plot + ylimit
  }
  output_plot = output_plot + plt_labels
  if (!is.null(theme_choice)) {
    output_plot = output_plot + theme_choice
  }
  output_plot = output_plot + rm_legend

  # output should be saved as an object so extra layers can be added
  invisible(output_plot)

}

Base Scatter

# scatterplot of iris petal length against sepal width via the wrapper
## pch, xlim, ylim and sub are not passed, so they stay at their NULL defaults
## the result is stored in plt so more layers can be added to it afterwards
plt = ungg_scatter(x = iris$Petal.Length,
                    y = iris$Sepal.Width,
                    data = iris,
                    main = "Iris Data",
                    xlab = "Petal Length",
                    ylab = "Sepal Width",
                    col = "skyblue",
                    cex = 1.2,
                    theme_choice = theme_bw())  

Base Scatter

Lines

  • Ideally we can show a regression line

  • All ggplot line proto objects include:

    • color, linewidth, linetype
# the four ggplot2 line layers, shown here with no arguments

# line through points
geom_line()

# horizontal line
geom_hline()

# vertical line
geom_vline()

# intercept + slope line
geom_abline()

I love lines

  • Base R has the same specifications, but it wraps them in two functions
# the two base R calls; linewidth, linetype, color, intercept, slope,
# horizontal and vertical are placeholder names mirroring the ggplot2 terms
# line through (or made from) points
## type = "l" makes points() draw connected line segments
points(x, y, type = "l", lwd = linewidth, lty = linetype, col = color)

# horizontal/vertical/abline
## a / b = intercept and slope, h = horizontal position(s), v = vertical position(s)
abline(a = intercept, b = slope,
       h = horizontal, v = vertical,
       lwd = linewidth, lty = linetype, col = color)

Line Proto Objects

# ungg_lines(): build ggplot2 line layers from base-R style arguments.
#
# Arguments
#   x, y     optional coordinates for a line through points -> geom_line()
#   h        optional horizontal position(s) -> geom_hline(yintercept = )
#   v        optional vertical position(s)   -> geom_vline(xintercept = )
#   ab       optional c(intercept, slope)    -> geom_abline()
#   lcol, hcol, vcol, abcol
#            optional colors for the line / hline / vline / abline layers
#   lwd, lty line widths and line types, indexed by slot:
#            [1] line through points, [2] hline, [3] vline, [4] abline
#            (only the slots of the requested layers are read)
#
# Returns (invisibly) a named list holding only the requested layers, in the
# order lines, hlines, vlines, ablines; NULL when nothing was requested.
# Add them to a plot one at a time, e.g. plt + out$hlines + out$ablines.
ungg_lines = function(x=NULL,y=NULL,h=NULL,v=NULL,ab=NULL,
                      lcol=NULL,hcol=NULL,
                      vcol=NULL,abcol=NULL,
                      lwd,lty) {

  # i've learned a lot of ggplot doing this
  ## ironically
  require(ggplot2)

  # one builder for every layer (one lizard instead of fourteen)
  ## geom  layer constructor, args  its layer-specific arguments
  ## color is only passed on when the caller supplied one
  ## slot  selects the entry of lwd / lty that belongs to this layer
  make_layer = function(geom, args, color, slot) {
    args$linewidth = lwd[slot]
    args$linetype = lty[slot]
    if (!is.null(color)) {
      args$color = color
    }
    do.call(geom, args)
  }

  # layers are collected in a list, in a fixed order
  output_lines = list()

  # line through points: needs both x and y
  ## previously dropped when no h / v / ab was given alongside it
  if (!is.null(x) && !is.null(y)) {
    output_lines$lines = make_layer(geom_line,
                                    list(mapping = aes(x = x, y = y)),
                                    lcol, 1)
  }

  # if hlines are specified
  if (!is.null(h)) {
    output_lines$hlines = make_layer(geom_hline,
                                     list(yintercept = h),
                                     hcol, 2)
  }

  # if vlines are specified
  if (!is.null(v)) {
    output_lines$vlines = make_layer(geom_vline,
                                     list(xintercept = v),
                                     vcol, 3)
  }

  # if an abline is specified
  ## the check is on the `ab` argument; `abline` is the graphics function
  ## and is never NULL, so testing it built this layer on every call
  if (!is.null(ab)) {
    if (length(ab) < 2) {
      stop("`ab` must be c(intercept, slope)", call. = FALSE)
    }
    output_lines$ablines = make_layer(geom_abline,
                                      list(intercept = ab[1], slope = ab[2]),
                                      abcol, 4)
  }

  # # # # # # # # # # # # # # # # L I Z A R D # # # # # # # # # # # # # # # # # # #

  # nothing requested: same NULL as before
  if (length(output_lines) == 0) {
    return(invisible(NULL))
  }

  # returned as a list, so reference its elements after the fact
  invisible(output_lines)

}

Scatter + Lines

# simple linear regression of sepal width on petal length
## its two coefficients (intercept, slope) are passed as `ab` below
m = lm(iris$Sepal.Width ~ iris$Petal.Length)

# mean lines for both variables plus the regression line
## lwd / lty are read by position inside ungg_lines:
## 1 = line through points, 2 = hline, 3 = vline, 4 = abline
plt_lines =  ungg_lines(h = mean(iris$Sepal.Width),
                        v = mean(iris$Petal.Length),
                        ab = as.numeric(coef(m)), 
                        hcol = "gold",
                        vcol = "orange",
                        abcol = "green",
                        lwd = c(1,1.5,1.5,1),
                        lty = c(1,3,3,1))

Scatter + Lines

# add the stored line layers to the saved scatterplot, one list element at a time
plt + plt_lines$hlines + plt_lines$vlines + plt_lines$ablines

Part 2: Base R = Good

Jitter

  • Base R plots force you to think

    • Base R in general
  • I thought jitter was a tough task in base R

    • jitter() exists

Original Plot

Plot Code

# Base R version of the jittered adverse-event plot.
# Reads df$pt (x categories), df$Drug (shape / color groups) and df$log_RR (y).

# as.numeric(factor()) is my lazy trick
## to make characters into integers
### the factor is kept so its levels can label the axis later
pt_f = factor(df$pt)
x = as.numeric(pt_f)

# reproducibility seed
set.seed(73)

# adds uniform noise to each value of x
## with amount left at its default, jitter() uses factor/5 * d, where d is
## the smallest gap between distinct x values; the codes are 1 apart,
## so the noise is uniform on (-0.2, 0.2)
jit_x = jitter(x)

# initialize drug names as numerics
## n_drug is kept as originally defined (sorted codes); it is not used below
n_drug = sort(as.numeric(as.factor(df$Drug)))

# per-row drug code, used to pick each point's shape and color
## indexing by drug keeps points and legend in agreement whatever the row
## order of df is (recycling pch / col by row only works for one ordering)
drug_f = factor(df$Drug)
drug_id = as.numeric(drug_f)

# log relative risk
log_RR = df$log_RR

# color scheme from scale_shape_manual
pnt_colors = c("#CC9900", "#999900", "#66CC33", 
               "#33CC66", "#00CC99", "#00CCCC", 
               "#00CCFF", "#33CCFF", "#6699FF", 
               "#9966FF", "#CC66CC", "#CC3399", 
               "#FF6699", "#FF66CC")

# one shape per color
pnt_shapes = 2:15

# every drug needs its own shape / color pair
stopifnot(nlevels(drug_f) <= length(pnt_colors))

# control margins of graph
## bottom, left, top, right
par(mar = c(10,4,0.5,8)) # (i always forget)

plot(jit_x,log_RR,
     pch = pnt_shapes[drug_id], # shapes, by drug
     col = pnt_colors[drug_id], # colors, by drug
     xaxt = "n", # no x axis ticks
     xlab = "", # no x axis label
     ylab = "log Relative Risk", # y axis label
     cex = 1.3) # point size

# build the grid
## vertical lines / ticks follow the number of categories in the data
pt_at = seq_along(levels(pt_f))
abline(h = seq(0,3,1),col="#D1D1D190") # horizontal
abline(v = pt_at,col="#D1D1D190") # vertical

# x axis ticks with character labels
axis(1, pt_at, labels = levels(pt_f),
     las = 2, cex.axis = 0.75)

# x axis label
mtext("Pulmonary Adverse Events", 1, 9)

# legend set to the right
## legend entry i uses the same shape / color as drug code i in the plot
drug_slots = seq_along(levels(drug_f))
legend("right", 
  legend = levels(drug_f), # drug names
  col = pnt_colors[drug_slots], # colors
  pch = pnt_shapes[drug_slots], # shapes
  bty = "n", # prevent legend box
  pt.cex = 1, # point size
  cex = 0.8, # text size
  text.col = "black", 
  horiz = FALSE, # prevent horizontal legend
  inset = c(-.2, .8), # hjust / vjust
  xpd = TRUE) # required to set it outside of the plot box

Plot

Questions?